#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Validate proper epilog and prolog child process management.
#
#
# Output:  "TEST: #.#" followed by "SUCCESS" if test was successful, OR
#          "FAILURE: ..." otherwise with an explanation of the failure, OR
#          anything else indicates a failure mode that must be investigated.
############################################################################
# Copyright (C) 2013 SchedMD LLC
# Written by Nathan Yee <nyee32@schedmd.com>
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################
source ./globals

# Test identity and the names of the files this test works with
set test_id          "31.1"
set test_name        "test31.1"
set config_dir       ""
set conf_path        "$config_dir/slurm.conf"
set inc_slurm        "pro_epi_$test_name"
set pro_epi_prog     "${test_name}_prog"

# State filled in while the test runs
set proc             ""
set run_host         ""
set make_file        0
set exit_code        0

# Cray node health tool used to run commands on the compute node
set pcmd_prog        "/opt/cray/nodehealth/default/bin/pcmd"

print_header $test_id

# On Cray systems the generated scripts also create a marker file, and
# pcmd is required to look for that file afterwards
if {[test_cray]} {
	set make_file 1
	if {![file exists $pcmd_prog]} {
		send_user "\nWARNING: The file $pcmd_prog was not found\n"
		exit $exit_code
	}
}

# Anything other than launch/slurm is not supported by this test
if {[test_launch_type] ne "slurm"} {
	send_user "\nWARNING: This test is only compatible with systems using launch/slurm\n"
	exit $exit_code
}
set timeout $max_job_delay

#
# Prepare the prolog/epilog script for the next job, together with the
# symbolic link to /bin/sleep that the script launches in the background.
#
# arg          - parameter name written as "<arg>=<script path>" into the
#                included configuration file; also used to name the marker
#                file touched by the script when make_file is set
# program_name - name of the symbolic link created in $cwd
# conf_change  - 0 to rewrite the included configuration file, any other
#                value leaves that file untouched
# task_pro     - non-zero to leave the background sleep out of the script
#
# Sets the global exit_code to 1 if the link could not be created.
#
proc change_pro_epi { arg program_name conf_change task_pro } {

	global inc_slurm bin_chmod bin_rm config_dir pro_epi_prog bin_echo test_name make_file bin_bash exit_code cwd file_dir bin_ln

	if {$conf_change == 0} {
		exec $bin_echo $arg=$config_dir/$pro_epi_prog > $config_dir/$inc_slurm
	}

	# Recreate the link from scratch so a stale one cannot be picked up
	set make_ln 0
	exec $bin_rm -f $cwd/$program_name
	spawn $bin_bash -c "$bin_ln -sv /bin/sleep $cwd/$program_name"
	expect {
		-re "$program_name" {
			set make_ln 1
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: There was a problem creating the link\n"
			set exit_code 1
		}
		eof {
			wait
		}
	}

	# An error message from ln contains the link name as well, so seeing
	# the name in the output is not proof of success. Confirm that the
	# symbolic link really exists (file readlink fails otherwise).
	if {$make_ln != 1 || [catch {file readlink $cwd/$program_name}]} {
		send_user "\nFAILURE: link was not created\n"
		set exit_code 1
	}

	# Write the script line by line, starting with the interpreter line
	exec $bin_echo "#!$bin_bash\n" > $config_dir/$pro_epi_prog
	if {$make_file == 1} {
		exec $bin_echo "touch $file_dir/$arg.$test_name.txt" >> $config_dir/$pro_epi_prog
	}

	# Task prologs are run with fork_wait, so don't do the sleep
	if {!$task_pro} {
		exec $bin_echo "$cwd/$program_name 1000&" >> $config_dir/$pro_epi_prog
	}
	exec $bin_echo "exit 0" >> $config_dir/$pro_epi_prog

	exec $bin_chmod 700 $config_dir/$pro_epi_prog
}

#
# Look for a file on the node that ran the job and fail the test if it
# is still there.
#
# file_name - name of the file searched for under $file_dir on $run_host
#
# Sets the global exit_code to 1 when the file is found or when the
# search does not respond.
#
proc check_file { file_name } {
	global exit_code file_dir run_host pcmd_prog

	# Pause for a second before searching
	sleep 1

	set still_there 0
	spawn $pcmd_prog -n $run_host "find $file_dir -name $file_name"
	expect {
		-re "$file_dir/$file_name" {
			set still_there 1
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: find is not responding\n"
			set exit_code 1
		}
		eof {
			wait
		}
	}

	if {!$still_there} {
		return
	}

	send_user "\nFAILURE: The file $file_name should have been removed\n"
	# Root may have created this file, so we can't remove it
	# exec $bin_rm -f $file_dir/$file_name
	set exit_code 1
}

#
# Verify that the background process started by a prolog/epilog script
# is no longer running.
#
# program_name - name the process was started under; the process list is
#                searched for "<program_name> 1000"
#
# Sets the global exit_code to 1 if the process is found (after trying to
# kill it) or if the process listing does not respond.
#
proc check_proc {program_name} {

	global bin_ps bin_bash bin_grep exit_code

	set proc_exist 0
	spawn $bin_bash -c "exec $bin_ps -ef | $bin_grep $program_name"
	expect {
		-re "$program_name 1000" {
			set proc_exist 1
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: timed out while looking for process\n"
			set exit_code 1
		}
		eof {
			wait
		}
	}
	if {$proc_exist == 1} {
		send_user "\nFAILURE: $program_name should have been killed\n"
		# Best-effort cleanup: exec raises a Tcl error when killall exits
		# non-zero or writes to stderr (e.g. the process belongs to another
		# user). Catch it so the script keeps running and can still restore
		# slurm.conf at the end.
		if {[catch {exec killall -9 $program_name} kill_msg]} {
			send_user "\nWARNING: unable to kill $program_name: $kill_msg\n"
		}
		set exit_code 1
	}
}

#
# Install a prolog/epilog script through the included configuration
# file, run a job on $run_host and verify that nothing is left behind.
#
# file_type    - parameter name handed to change_pro_epi (e.g. epilog)
# program_name - name of the sleep link the script runs in the background
# conf_change  - passed through to change_pro_epi
# task_pro     - 1 for task prologs: no background process is started, so
#                the process check is skipped
#
# Sets the global exit_code to 1 on any failure.
#
proc run_job { file_type program_name conf_change task_pro} {

	global srun run_host make_file exit_code bin_rm bin_printenv cwd test_name

	# Change the included file to epilog in the slurm.conf
	change_pro_epi $file_type $program_name $conf_change $task_pro

	# Wait for NFS file propagation as needed.
	# Use Expect's built-in sleep (as check_file does): invoking the value
	# of $bin_sleep as a Tcl command only works when that variable happens
	# to hold the bare word "sleep" rather than a path to the binary.
	sleep 10

	# Update config file
	reconfigure

	set sub_job 0
	spawn $srun -t1 -w$run_host $bin_printenv SLURMD_NODENAME
	expect {
		-re "$run_host" {
			set sub_job 1
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: srun is not responding\n"
			set exit_code 1
		}
		eof {
			wait
		}
	}

	if {$sub_job != 1} {
		send_user "\nFAILURE: srun did not start job\n"
		set exit_code 1
	}

	# wait for file
	if {$make_file == 1} {
		# Check to see that the epilog or prolog file has been removed
		check_file $file_type.$test_name.txt
	}

	if {$task_pro != 1} {
		# Check to see if the child process still exists
		check_proc $program_name
	}

	# Remove link
	exec $bin_rm -rf $cwd/$program_name
}


# Skip the test unless it is run as SlurmUser
if {![test_super_user]} {
	send_user "WARNING: Test can only be run as SlurmUser\n"
	exit $exit_code
}

# Skip the test on front-end configurations
if {[test_front_end] == 1} {
	send_user "WARNING: Test can not be run with front-end configuration\n"
	exit $exit_code
}

#
# Determine the directory holding slurm.conf from "scontrol show config".
# Echoing of the spawned command's output is turned off while it is parsed.
#
set got_config 0
log_user 0
spawn $scontrol show config
expect {
	-re "SLURM_CONF.*= (/.*)/($alpha).*SLURM_VERSION" {
		# First group: everything before the final path component
		set config_dir $expect_out(1,string)
		set got_config 1
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: scontrol is not responding\n"
		set exit_code 1
	}
	eof {
		wait
	}
}
log_user 1

# Nothing else can be done without knowing where the configuration lives
if {!$got_config} {
	send_user "\nFAILURE: Could not identify slurm.conf location\n"
	exit 1
}

#
# Run a trivial job to learn the name of a node jobs can run on; every
# later job is pinned to that node with -w.
#
set get_name 0
spawn $srun -t1 -l $bin_printenv SLURMD_NODENAME
expect {
	-re "0: *($alpha_numeric_under)" {
		set run_host $expect_out(1,string)
		set get_name 1
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: Did not get run_host\n"
		set exit_code 1
	}
	eof {
		wait
	}
}

# Use Tcl's built-in pwd: invoking the value of $bin_pwd as a Tcl command
# only works when that variable happens to hold the bare word "pwd"
# rather than a path to the binary.
set cwd [pwd]
set file_dir "/tmp"
if {$get_name != 1} {
	send_user "\nFAILURE: Did not get run_host\n"
	exit 1
}

#
# Copy slurm.conf file
#
exec $bin_rm -fr $cwd/slurm.conf.orig
spawn $bin_cp -v $config_dir/slurm.conf $cwd/slurm.conf.orig
expect {
	timeout {
		send_user "\nFAILURE: slurm.conf was not copied\n"
		set exit_code 1
	}
	eof {
		wait
	}
}

# slurm.conf is modified further down and restored from this copy at the
# end of the test, so stop now, before anything has been changed, if the
# backup does not exist.
if {![file exists $cwd/slurm.conf.orig]} {
	send_user "\nFAILURE: slurm.conf was not copied\n"
	exit 1
}

#
# Test srun --task-epilog
#
# The script is handed to srun on the command line, so the included
# configuration file is left alone (conf_change of 1).
#
set task_epilog_prog "srun--task-epilog_$test_name"
change_pro_epi srun-task-epilog $task_epilog_prog 1 0

reconfigure

set invalid_node 0
set sub_job 0
spawn $srun -t1 -w$run_host --task-epilog=$config_dir/$pro_epi_prog $bin_printenv SLURMD_NODENAME
expect {
	-re "$run_host" {
		set sub_job 1
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: srun is not responding\n"
		set exit_code 1
	}
	eof {
		wait
	}
}
if {!$sub_job} {
	send_user "\nFAILURE: srun did not submit job\n"
	set exit_code 1
}

# The marker file is only created when make_file is set
if {$make_file} {
	check_file srun-task-epilog.$test_name.txt
}

# The background sleep started by the epilog must be gone by now
check_proc $task_epilog_prog

# Remove link
exec $bin_rm -rf $cwd/$task_epilog_prog

#
# Task Prolog Test
#
# NOTE: This section only runs the task prolog tests on a Cray system
# (make_file is set to 1 only when test_cray reports one). On a Cray system
# it runs the user-invoked srun --task-prolog and the slurm.conf taskprolog.
# Both of these prologs wait for the child process to finish before finishing
# the job. In order to test the task prologs we leave out the child process
# and only check for the created file.
#

if {$make_file == 1} {

	# Test srun --task-prolog
	# (conf_change 1: included config file untouched; task_pro 1: no
	# background sleep in the script)
	change_pro_epi srun-task-prolog srun--task-prolog_$test_name 1 1

	reconfigure

	set sub_job 0
	set sbcast_file      "$file_dir/${test_name}_sbcast"

	# Start a shell inside an allocation on the run host
	spawn $salloc -t1 -w$run_host $bin_bash
	expect {
		-re "Granted job allocation ($number)" {
			# Once the allocation is granted, feed the shell three
			# commands: broadcast the script with sbcast, run srun with
			# --task-prolog, then leave the shell
			send "$sbcast --force $config_dir/$pro_epi_prog $sbcast_file\n"
			send "$srun -t1 --task-prolog=$config_dir/$pro_epi_prog $bin_printenv SLURMD_NODENAME\n"
			send "exit\n"
			exp_continue
		}
		-re "$run_host" {
			# The host name in the output shows that the job step ran
			set sub_job 1
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: salloc is not responding\n"
			set exit_code 1
		}
		eof {
			wait
		}
	}
	if {$sub_job != 1} {
		send_user "\nFAILURE: srun did not submit job\n"
		set exit_code 1
	}

	# check_file fails the test if the named file is still found on the node
	check_file ${test_name}_sbcast

	check_file srun-task-prolog.$test_name.txt

	# Remove link
	exec $bin_rm -rf $cwd/srun--task-prolog_$test_name

	# Add a line in the slurm.conf file to include the epilogs and prologs
	exec $bin_echo include $config_dir/$inc_slurm >> $config_dir/slurm.conf

	#
	# Test TaskProlog
	#
	run_job taskprolog taskprolog_$test_name 0 1

} else {
	# Add a line in the slurm.conf file to include the epilogs and prologs
	exec $bin_echo include $config_dir/$inc_slurm >> $config_dir/slurm.conf
}

###############Epilog and Prolog Tests###############

#
# Test the Epilog, TaskEpilog and Prolog params, in that order. Each run
# rewrites the included configuration file and starts a background sleep
# from the script that must not outlive the job.
#
foreach param {epilog taskepilog prolog} {
	run_job $param ${param}_$test_name 0 0
}

# Clean up vestigial files and restore original slurm.conf file
exec $bin_cp $cwd/slurm.conf.orig $config_dir/slurm.conf
reconfigure

# The generated files are removed only when the test passed
if {!$exit_code} {
	exec $bin_rm -fr $config_dir/$inc_slurm $config_dir/$pro_epi_prog $cwd/slurm.conf.orig
	send_user "\nSUCCESS\n"
}
exit $exit_code
